In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [2]:
df = pd.read_csv('owid-covid-data.csv')
In [3]:
df
Out[3]:
iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed ... female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index excess_mortality_cumulative_absolute excess_mortality_cumulative excess_mortality excess_mortality_cumulative_per_million
0 AFG Asia Afghanistan 2020-02-24 5.0 5.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
1 AFG Asia Afghanistan 2020-02-25 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
2 AFG Asia Afghanistan 2020-02-26 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
3 AFG Asia Afghanistan 2020-02-27 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
4 AFG Asia Afghanistan 2020-02-28 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
166321 ZWE Africa Zimbabwe 2022-03-01 236871.0 491.0 413.000 5395.0 0.0 1.000 ... 1.6 30.7 36.791 1.7 61.49 0.571 NaN NaN NaN NaN
166322 ZWE Africa Zimbabwe 2022-03-02 237503.0 632.0 416.286 5396.0 1.0 1.143 ... 1.6 30.7 36.791 1.7 61.49 0.571 NaN NaN NaN NaN
166323 ZWE Africa Zimbabwe 2022-03-03 237503.0 0.0 362.286 5396.0 0.0 0.857 ... 1.6 30.7 36.791 1.7 61.49 0.571 NaN NaN NaN NaN
166324 ZWE Africa Zimbabwe 2022-03-04 238739.0 1236.0 467.429 5397.0 1.0 0.714 ... 1.6 30.7 36.791 1.7 61.49 0.571 NaN NaN NaN NaN
166325 ZWE Africa Zimbabwe 2022-03-05 239019.0 280.0 459.429 5397.0 0.0 0.571 ... 1.6 30.7 36.791 1.7 61.49 0.571 NaN NaN NaN NaN

166326 rows × 67 columns

In [4]:
df.head()
Out[4]:
iso_code continent location date total_cases new_cases new_cases_smoothed total_deaths new_deaths new_deaths_smoothed ... female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand life_expectancy human_development_index excess_mortality_cumulative_absolute excess_mortality_cumulative excess_mortality excess_mortality_cumulative_per_million
0 AFG Asia Afghanistan 2020-02-24 5.0 5.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
1 AFG Asia Afghanistan 2020-02-25 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
2 AFG Asia Afghanistan 2020-02-26 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
3 AFG Asia Afghanistan 2020-02-27 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN
4 AFG Asia Afghanistan 2020-02-28 5.0 0.0 NaN NaN NaN NaN ... NaN NaN 37.746 0.5 64.83 0.511 NaN NaN NaN NaN

5 rows × 67 columns

1 - Data Cleaning¶

1.1 - Check the missing values¶

In [5]:
df.isna().sum()
Out[5]:
iso_code                                        0
continent                                    9956
location                                        0
date                                            0
total_cases                                  3033
                                            ...  
human_development_index                     30073
excess_mortality_cumulative_absolute       160630
excess_mortality_cumulative                160630
excess_mortality                           160630
excess_mortality_cumulative_per_million    160630
Length: 67, dtype: int64

Dropping missing values >= 20000¶

In [6]:
for i in df:
    if df[i].isna().sum() >= 20000 and i != "total_deaths":
        df.drop(i, axis = 1, inplace = True)
In [7]:
df.isna().sum()
Out[7]:
iso_code                              0
continent                          9956
location                              0
date                                  0
total_cases                        3033
new_cases                          3193
new_cases_smoothed                 5176
total_deaths                      20875
total_cases_per_million            3791
new_cases_per_million              3951
new_cases_smoothed_per_million     5928
population                         1075
population_density                18398
life_expectancy                   11058
dtype: int64

Dropping null values in continent¶

In [8]:
df = df[df['continent'].notna()]
In [9]:
df.isna().sum()
Out[9]:
iso_code                              0
continent                             0
location                              0
date                                  0
total_cases                        3026
new_cases                          3193
new_cases_smoothed                 5098
total_deaths                      20662
total_cases_per_million            3026
new_cases_per_million              3193
new_cases_smoothed_per_million     5098
population                          317
population_density                 9216
life_expectancy                    1876
dtype: int64

Dropping the outliers in total_cases¶

In [10]:
df['total_cases'].describe()
Out[10]:
count    1.533440e+05
mean     6.444804e+05
std      3.202692e+06
min      1.000000e+00
25%      1.764000e+03
50%      2.042400e+04
75%      2.278665e+05
max      7.926573e+07
Name: total_cases, dtype: float64
In [11]:
df.boxplot(column = ['total_cases'])
print(len(df['total_cases']))
156370
In [12]:
q75, q25 = df['total_cases'].quantile(0.75), df['total_cases'].quantile(0.25)
iqr = q75 - q25

# Calculate the lower and upper bounds
lower_bound = q25 - (1.5 * iqr)
upper_bound = q75 + (1.5 * iqr)

df = df.drop(df[(df['total_cases'] < lower_bound) | (df['total_cases'] > upper_bound)].index)
In [13]:
df.boxplot(column = ['total_cases'])
print(len(df['total_cases']))
133613
In [14]:
df.isna().sum()
Out[14]:
iso_code                              0
continent                             0
location                              0
date                                  0
total_cases                        3026
new_cases                          3153
new_cases_smoothed                 4926
total_deaths                      20662
total_cases_per_million            3026
new_cases_per_million              3153
new_cases_smoothed_per_million     4926
population                          317
population_density                 9216
life_expectancy                    1876
dtype: int64
In [15]:
df['total_cases'].describe()
Out[15]:
count    130587.000000
mean      72515.788601
std      123395.731988
min           1.000000
25%         909.000000
50%       11358.000000
75%       84804.500000
max      566966.000000
Name: total_cases, dtype: float64

Fill missing values with the mean¶

In [16]:
df['total_cases'] = df['total_cases'].fillna(df['total_cases'].mean())
In [17]:
df.isna().sum()
Out[17]:
iso_code                              0
continent                             0
location                              0
date                                  0
total_cases                           0
new_cases                          3153
new_cases_smoothed                 4926
total_deaths                      20662
total_cases_per_million            3026
new_cases_per_million              3153
new_cases_smoothed_per_million     4926
population                          317
population_density                 9216
life_expectancy                    1876
dtype: int64
In [18]:
# dropping the smoothed values

df.drop('new_cases_smoothed', axis = 1, inplace = True)
df.drop('new_cases_smoothed_per_million', axis = 1, inplace = True)
In [19]:
df.isna().sum()
Out[19]:
iso_code                       0
continent                      0
location                       0
date                           0
total_cases                    0
new_cases                   3153
total_deaths               20662
total_cases_per_million     3026
new_cases_per_million       3153
population                   317
population_density          9216
life_expectancy             1876
dtype: int64

fill missing values in total_deathes to 0¶

In [20]:
df['total_deaths'] =  df['total_deaths'].fillna(0)
In [21]:
df.isna().sum()
Out[21]:
iso_code                      0
continent                     0
location                      0
date                          0
total_cases                   0
new_cases                  3153
total_deaths                  0
total_cases_per_million    3026
new_cases_per_million      3153
population                  317
population_density         9216
life_expectancy            1876
dtype: int64

removing the null values rows in new cases¶

In [22]:
df = df[df['new_cases'].notna()]
In [23]:
df.isna().sum()
Out[23]:
iso_code                      0
continent                     0
location                      0
date                          0
total_cases                   0
new_cases                     0
total_deaths                  0
total_cases_per_million       0
new_cases_per_million         0
population                    0
population_density         7779
life_expectancy             722
dtype: int64
In [24]:
df = df[df['population_density'].notna()]
In [25]:
df.isna().sum()
Out[25]:
iso_code                     0
continent                    0
location                     0
date                         0
total_cases                  0
new_cases                    0
total_deaths                 0
total_cases_per_million      0
new_cases_per_million        0
population                   0
population_density           0
life_expectancy            722
dtype: int64

Removing the null values in life_expetancy¶

In [26]:
df = df[df['life_expectancy'].notna()]
In [27]:
df.isna().sum()
Out[27]:
iso_code                   0
continent                  0
location                   0
date                       0
total_cases                0
new_cases                  0
total_deaths               0
total_cases_per_million    0
new_cases_per_million      0
population                 0
population_density         0
life_expectancy            0
dtype: int64

Remove outliers from the numerical data¶

In [28]:
for i in df:
    if i != 'iso_code' and i != 'continent' and i != 'location' and i != 'date':
        q75, q25 = df[i].quantile(0.75), df[i].quantile(0.25)
        iqr = q75 - q25

        # Calculate the lower and upper bounds
        lower_bound = q25 - (1.5 * iqr)
        upper_bound = q75 + (1.5 * iqr)

        df = df.drop(df[(df[i] < lower_bound) | (df[i] > upper_bound)].index)
In [29]:
df
Out[29]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy
0 AFG Asia Afghanistan 2020-02-24 5.0 5.0 0.0 0.126 0.126 39835428.0 54.422 64.83
1 AFG Asia Afghanistan 2020-02-25 5.0 0.0 0.0 0.126 0.000 39835428.0 54.422 64.83
2 AFG Asia Afghanistan 2020-02-26 5.0 0.0 0.0 0.126 0.000 39835428.0 54.422 64.83
3 AFG Asia Afghanistan 2020-02-27 5.0 0.0 0.0 0.126 0.000 39835428.0 54.422 64.83
4 AFG Asia Afghanistan 2020-02-28 5.0 0.0 0.0 0.126 0.000 39835428.0 54.422 64.83
... ... ... ... ... ... ... ... ... ... ... ... ...
165892 ZWE Africa Zimbabwe 2020-12-27 13077.0 114.0 349.0 866.476 7.554 15092171.0 42.729 61.49
165893 ZWE Africa Zimbabwe 2020-12-28 13148.0 71.0 354.0 871.180 4.704 15092171.0 42.729 61.49
165894 ZWE Africa Zimbabwe 2020-12-29 13325.0 177.0 359.0 882.908 11.728 15092171.0 42.729 61.49
165896 ZWE Africa Zimbabwe 2020-12-31 13867.0 242.0 363.0 918.821 16.035 15092171.0 42.729 61.49
165897 ZWE Africa Zimbabwe 2021-01-01 14084.0 217.0 369.0 933.199 14.378 15092171.0 42.729 61.49

44369 rows × 12 columns

Export the good data to a new excel file¶

In [30]:
df.to_excel("New_data.xlsx", sheet_name = "newdata", index = False)

2 - Data Analysis¶

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [32]:
df = pd.read_excel('New_data.xlsx')
In [33]:
df
Out[33]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy
0 AFG Asia Afghanistan 2020-02-24 5 5 0 0.126 0.126 39835428 54.422 64.83
1 AFG Asia Afghanistan 2020-02-25 5 0 0 0.126 0.000 39835428 54.422 64.83
2 AFG Asia Afghanistan 2020-02-26 5 0 0 0.126 0.000 39835428 54.422 64.83
3 AFG Asia Afghanistan 2020-02-27 5 0 0 0.126 0.000 39835428 54.422 64.83
4 AFG Asia Afghanistan 2020-02-28 5 0 0 0.126 0.000 39835428 54.422 64.83
... ... ... ... ... ... ... ... ... ... ... ... ...
44364 ZWE Africa Zimbabwe 2020-12-27 13077 114 349 866.476 7.554 15092171 42.729 61.49
44365 ZWE Africa Zimbabwe 2020-12-28 13148 71 354 871.180 4.704 15092171 42.729 61.49
44366 ZWE Africa Zimbabwe 2020-12-29 13325 177 359 882.908 11.728 15092171 42.729 61.49
44367 ZWE Africa Zimbabwe 2020-12-31 13867 242 363 918.821 16.035 15092171 42.729 61.49
44368 ZWE Africa Zimbabwe 2021-01-01 14084 217 369 933.199 14.378 15092171 42.729 61.49

44369 rows × 12 columns

2.1 Total Cases in Africa¶

In [34]:
cases= df[df.continent == 'Africa']
In [35]:
cases['total_cases'].plot.hist(figsize=(15,10));

2.2 Total deaths in Europe¶

In [36]:
deathes= df[df.continent == 'Europe']
In [37]:
death=deathes['total_deaths']
minn = death.min()
maxx = death.max()
diff = (death.max() - death.min())
print('minimum = ', minn)
print('maximum = ', maxx)
print('difference = ', diff)
minimum =  0
maximum =  861
difference =  861
In [38]:
m = death.min()
l = (death.max() - death.min())//5 # length of each interval
g1 = death[death < m+l ] # add l to m; beacause we start from m not zero.
g2=death[(death>=m+l)&(death<m+2*l)]
g3=death[(death>=m+2*l)&(death<m+3*l)]
g4=death[(death>=m+3*l)&(death<m+4*l)]
g5=death[(death>=m+4*l)]
In [39]:
print('length of each interval', l)
print( len(g1))
print( len(g2))
print( len(g3))
print( len(g4))
print( len(g5))
length of each interval 172
3413
564
93
308
119
In [40]:
groups = {'G1':len(g1), 'G2':len(g2), 'G3':len(g3), 'G4':len(g4), 'G5':len(g5)}
groups_df = pd.DataFrame(groups.values(), columns=['count'], index=groups.keys())
groups_df
Out[40]:
count
G1 3413
G2 564
G3 93
G4 308
G5 119
In [41]:
groups_df.plot(kind='barh');

2.3 5 World groups¶

In [42]:
mini = df.population_density.describe()[3]
maxx=df[df.population_density<=200000].population_density.describe()[7]
rang = (maxx - mini)/5
#x=df[(df.Income<=2*rang) & (df.Income>=rang)]
print(rang)
87.443
In [43]:
def Group(x):
    r= 87.443
    if (x >= 0) & (x < r) :
        return 'g1'
    elif (x >= r) & (x < 2*r):
        return 'g2'
    elif (x >= 2*r) & (x < 3*r):
        return 'g3'
    elif (x >= 3*r) & (x < 4*r):
        return 'g4'
    else:
        return 'g5'

#df['groups']=df.Income.apply(lambda x: Group(x))
In [44]:
df['group'] = df.population_density.apply(lambda x: Group(x))
In [45]:
df.groupby('group')[['total_deaths', 'total_cases', 'life_expectancy']].mean().plot(kind='bar')
Out[45]:
<AxesSubplot:xlabel='group'>
In [46]:
df[df.group=='g1']
Out[46]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy group
0 AFG Asia Afghanistan 2020-02-24 5 5 0 0.126 0.126 39835428 54.422 64.83 g1
1 AFG Asia Afghanistan 2020-02-25 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
2 AFG Asia Afghanistan 2020-02-26 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
3 AFG Asia Afghanistan 2020-02-27 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
4 AFG Asia Afghanistan 2020-02-28 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44364 ZWE Africa Zimbabwe 2020-12-27 13077 114 349 866.476 7.554 15092171 42.729 61.49 g1
44365 ZWE Africa Zimbabwe 2020-12-28 13148 71 354 871.180 4.704 15092171 42.729 61.49 g1
44366 ZWE Africa Zimbabwe 2020-12-29 13325 177 359 882.908 11.728 15092171 42.729 61.49 g1
44367 ZWE Africa Zimbabwe 2020-12-31 13867 242 363 918.821 16.035 15092171 42.729 61.49 g1
44368 ZWE Africa Zimbabwe 2021-01-01 14084 217 369 933.199 14.378 15092171 42.729 61.49 g1

28567 rows × 13 columns

Heatmap¶

In [124]:
plt.figure(figsize=(10, 10))  
sns.heatmap(df.corr(), annot=True)  
plt.show()
In [48]:
df
Out[48]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy group
0 AFG Asia Afghanistan 2020-02-24 5 5 0 0.126 0.126 39835428 54.422 64.83 g1
1 AFG Asia Afghanistan 2020-02-25 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
2 AFG Asia Afghanistan 2020-02-26 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
3 AFG Asia Afghanistan 2020-02-27 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
4 AFG Asia Afghanistan 2020-02-28 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44364 ZWE Africa Zimbabwe 2020-12-27 13077 114 349 866.476 7.554 15092171 42.729 61.49 g1
44365 ZWE Africa Zimbabwe 2020-12-28 13148 71 354 871.180 4.704 15092171 42.729 61.49 g1
44366 ZWE Africa Zimbabwe 2020-12-29 13325 177 359 882.908 11.728 15092171 42.729 61.49 g1
44367 ZWE Africa Zimbabwe 2020-12-31 13867 242 363 918.821 16.035 15092171 42.729 61.49 g1
44368 ZWE Africa Zimbabwe 2021-01-01 14084 217 369 933.199 14.378 15092171 42.729 61.49 g1

44369 rows × 13 columns

2.5 Asia and Africa with the Life Expectancy¶

In [49]:
df_A = df[df['continent']=='Asia']
df_E = df[df['continent']=='Africa']
In [50]:
df_A['life_expectancy'].hist(alpha=0.5);
df_E['life_expectancy'].hist(alpha=0.5);

Africa and Asia¶

In [51]:
df_A.hist(figsize=(15,15));
In [52]:
df_E.hist(figsize=(15,15));

2.6 Total deaths in the World¶

In [53]:
plt.figure(figsize=(15,5))
sns.barplot(x='continent', y='total_deaths', data=df)
plt.show()

Most countrey have Total death in The world¶

In [125]:
max_index = df['total_cases'].idxmax()
max_location = df.loc[max_index, 'location']
max_value=df['total_cases'].max()
print(max_location + ' has the maximum number of total cases, with ' + str(max_value) + ' cases.')
Uzbekistan has the maximum number of total cases, with 116421 cases.

Total deaths in South America¶

In [54]:
af= df[df.continent == 'South America']
In [55]:
plt.figure(figsize=(15,5))
sns.barplot(x='location', y='total_deaths', data=af)
plt.show()

most content in which has the most total cases¶

In [127]:
grouped_data = df.groupby('continent')
max_cases = grouped_data.max()['total_cases']
max_cases = max_cases.sort_values(ascending=False)
max_cases
Out[127]:
continent
Asia             116421
Africa           106750
South America     98665
Europe            79852
North America     72049
Oceania           71122
Name: total_cases, dtype: int64
In [ ]:
 
In [56]:
df
Out[56]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy group
0 AFG Asia Afghanistan 2020-02-24 5 5 0 0.126 0.126 39835428 54.422 64.83 g1
1 AFG Asia Afghanistan 2020-02-25 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
2 AFG Asia Afghanistan 2020-02-26 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
3 AFG Asia Afghanistan 2020-02-27 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
4 AFG Asia Afghanistan 2020-02-28 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44364 ZWE Africa Zimbabwe 2020-12-27 13077 114 349 866.476 7.554 15092171 42.729 61.49 g1
44365 ZWE Africa Zimbabwe 2020-12-28 13148 71 354 871.180 4.704 15092171 42.729 61.49 g1
44366 ZWE Africa Zimbabwe 2020-12-29 13325 177 359 882.908 11.728 15092171 42.729 61.49 g1
44367 ZWE Africa Zimbabwe 2020-12-31 13867 242 363 918.821 16.035 15092171 42.729 61.49 g1
44368 ZWE Africa Zimbabwe 2021-01-01 14084 217 369 933.199 14.378 15092171 42.729 61.49 g1

44369 rows × 13 columns

Most countrey have new cases in the world¶

In [126]:
max_indexs = df['new_cases'].idxmax()
max_locations = df.loc[max_indexs, 'location']
max_values=df['new_cases'].max()
print(max_location + ' has the maximum number of total cases, with ' + str(max_values) + ' cases.')
Uzbekistan has the maximum number of total cases, with 472 cases.

prediction of total deaths in Peru¶

In [57]:
afg= df[df.location == 'Peru']
In [58]:
X = afg['new_cases'].values
Y = afg['total_deaths'].values
In [59]:
a = sum([(x-X.mean()) * (y-Y.mean()) for x,y in zip(X,Y)]) /sum([(x-X.mean())**2 for x in X])
b= np.mean(Y)-a*np.mean(X)
In [60]:
a
Out[60]:
1.0328387466255473
In [61]:
b
Out[61]:
-1.0761540027249623
In [62]:
def y_hat(x):
    a=1.0328387466255473
    b=-1.0761540027249623

    return a*x+b
In [63]:
yhat = lambda x: a * x + b #prediction
In [64]:
plt.scatter(X,Y)
plt.plot([X.min(), X.max()], [yhat(X.min()), yhat(X.max())], color = 'g')
plt.scatter(X.mean(), Y.mean(), marker = 'D', c = 'r')
Out[64]:
<matplotlib.collections.PathCollection at 0x17aca378a30>
In [65]:
Yhat = np.array([yhat(x) for x in X])
TSS = sum([(y-np.mean(Y))**2 for y in Y])
RSS = sum([(y-y_hat(x))**2 for x,y in zip (X,Y)])
R2 = 1 - RSS/TSS
print(f'TSS is: {TSS}, RSS is: {RSS}, R2 is: {R2}')
TSS is: 356028.0, RSS is: 60733.20530727628, R2 is: 0.8294145255224974

Population density in South America¶

In [66]:
sorted_counts = af['population_density'].value_counts()#south america
gen_percentage = sorted_counts/sorted_counts[:].sum()
plt.pie(gen_percentage, labels = sorted_counts.index, startangle = 200, counterclock = False);

New Cases in Chade by date¶

In [67]:
import seaborn as sns
chad= df[df.location == 'Chad']
t = pd.DataFrame(af['new_cases'].value_counts().reset_index())
t.columns = ['date', 'new_cases']
plt.figure(figsize=(18,5))
sns.pointplot(x='date', y='new_cases', data=chad)
Out[67]:
<AxesSubplot:xlabel='date', ylabel='new_cases'>
In [68]:
def calculate_w(x,y):
    ## write your code
    x_T=x.T
    x_T_x=x_T @ x #dot product
    x_T_x_inv=np.linalg.pinv(x_T_x)
    
    x_T_y= x_T @ y
    w= x_T_x_inv @ x_T_y




    return w
In [69]:
def polynomial(x, degree):
    x_poly = []
    ## write your code
    for i in range (1,degree+1):
        x_poly.append(x**i)
    x_poly.append(np.ones(len(x)))
        
    
    
    
    #return x_poly
    return np.array(x_poly).T
In [70]:
degree=2
x_poly=polynomial(X,degree)
w=calculate_w(x_poly,Y)
w
Out[70]:
array([9.23560843e-01, 3.49167594e-04, 2.16881279e+00])
In [ ]:
 
In [71]:
df
Out[71]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy group
0 AFG Asia Afghanistan 2020-02-24 5 5 0 0.126 0.126 39835428 54.422 64.83 g1
1 AFG Asia Afghanistan 2020-02-25 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
2 AFG Asia Afghanistan 2020-02-26 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
3 AFG Asia Afghanistan 2020-02-27 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
4 AFG Asia Afghanistan 2020-02-28 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
44364 ZWE Africa Zimbabwe 2020-12-27 13077 114 349 866.476 7.554 15092171 42.729 61.49 g1
44365 ZWE Africa Zimbabwe 2020-12-28 13148 71 354 871.180 4.704 15092171 42.729 61.49 g1
44366 ZWE Africa Zimbabwe 2020-12-29 13325 177 359 882.908 11.728 15092171 42.729 61.49 g1
44367 ZWE Africa Zimbabwe 2020-12-31 13867 242 363 918.821 16.035 15092171 42.729 61.49 g1
44368 ZWE Africa Zimbabwe 2021-01-01 14084 217 369 933.199 14.378 15092171 42.729 61.49 g1

44369 rows × 13 columns

Prediction of Total deaths in Angola¶

In [72]:
a= df[df.location == 'Angola']
X = a.loc[:, ['new_cases']]
y = a.loc[:, ['total_deaths']]
In [73]:
X
Out[73]:
new_cases
315 1
316 1
317 0
318 1
319 0
... ...
768 173
769 195
770 178
771 137
772 74

458 rows × 1 columns

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)
In [75]:
w1=calculate_w(X_train,y_train)
y_pred = np.matmul(X_test.values, w1)
In [76]:
plt.scatter(X_train.values, y_train.values,color='g') 

plt.plot(X_test.values, y_pred.values,color='k') 



plt.show()
In [77]:
TSS = sum([(y-np.mean(y_test.values))**2 for y in y_test.values])
RSS = sum([(yt-yp)**2 for yt,yp in zip(y_test.values,y_pred.values)])
R2 = 1 - RSS/TSS
R2[0]
Out[77]:
0.11374681205089965
In [78]:
from sklearn.metrics import r2_score
r2_score(y_test.values, y_pred)
Out[78]:
0.1137468120509002

Prediction Of total death for total cases¶

In [120]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

a=df['total_cases'].values.reshape(-1, 1)
b=df['total_deaths'].values.reshape(-1, 1)

a_train,a_test,b_train,b_test = train_test_split(a,b,test_size=0.25)
regr = LinearRegression()
regr.fit(a_train,b_train)
b_pred = regr.predict(a_test)
print("The MSE will be: ",mean_squared_error(b_test,b_pred))

print("The R squared error will be: ",r2_score(b_test,b_pred))
The MSE will be:  13616.019049141203
The R squared error will be:  0.5841192083641535
In [121]:
plt.scatter(a_train, b_train, color='orange')
plt.scatter(a_test, b_pred, color='red')
plt.xlabel('total_cases')
plt.ylabel('total_Deaths')
plt.show()
In [99]:
df2=df_A.loc[:,'new_cases':'life_expectancy']
df2
Out[99]:
new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy
0 5 0 0.126 0.126 39835428 54.422 64.83
1 0 0 0.126 0.000 39835428 54.422 64.83
2 0 0 0.126 0.000 39835428 54.422 64.83
3 0 0 0.126 0.000 39835428 54.422 64.83
4 0 0 0.126 0.000 39835428 54.422 64.83
... ... ... ... ... ... ... ...
43801 113 810 125.153 3.706 30490639 53.508 66.12
43802 84 820 127.908 2.755 30490639 53.508 66.12
43803 69 832 130.171 2.263 30490639 53.508 66.12
43804 64 851 132.270 2.099 30490639 53.508 66.12
43805 82 863 134.959 2.689 30490639 53.508 66.12

6431 rows × 7 columns

In [80]:
df_A
Out[80]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy group
0 AFG Asia Afghanistan 2020-02-24 5 5 0 0.126 0.126 39835428 54.422 64.83 g1
1 AFG Asia Afghanistan 2020-02-25 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
2 AFG Asia Afghanistan 2020-02-26 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
3 AFG Asia Afghanistan 2020-02-27 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
4 AFG Asia Afghanistan 2020-02-28 5 0 0 0.126 0.000 39835428 54.422 64.83 g1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
43801 YEM Asia Yemen 2021-03-25 3816 113 810 125.153 3.706 30490639 53.508 66.12 g1
43802 YEM Asia Yemen 2021-03-26 3900 84 820 127.908 2.755 30490639 53.508 66.12 g1
43803 YEM Asia Yemen 2021-03-27 3969 69 832 130.171 2.263 30490639 53.508 66.12 g1
43804 YEM Asia Yemen 2021-03-28 4033 64 851 132.270 2.099 30490639 53.508 66.12 g1
43805 YEM Asia Yemen 2021-03-29 4115 82 863 134.959 2.689 30490639 53.508 66.12 g1

6431 rows × 13 columns

Group by Date in Africa¶

In [81]:
df_africa = df_A.groupby(["date"])[["total_cases","new_cases","total_deaths"]].sum().reset_index().sort_values("date",ascending=True).reset_index(drop=True)
df_africa
Out[81]:
date total_cases new_cases total_deaths
0 2020-01-25 5 5 0
1 2020-01-26 5 0 0
2 2020-01-27 7 2 0
3 2020-01-28 7 0 0
4 2020-01-29 14 7 0
... ... ... ... ...
766 2022-03-01 17786 0 125
767 2022-03-02 17786 0 125
768 2022-03-03 17786 0 125
769 2022-03-04 17786 0 125
770 2022-03-05 17786 0 125

771 rows × 4 columns

Total cases in Africa¶

In [82]:
plt.figure(figsize=(20,10))
plt.plot(df_africa['date'], df_africa['total_cases'],marker='o'
         ,c='r',ls='--',markersize=10)
plt.title('Evolution of Confirmed Covid-19 cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('Confirmed cases', fontsize=16)
Out[82]:
Text(0, 0.5, 'Confirmed cases')
In [83]:
plt.figure(figsize=(20,10))
plt.plot(df_africa.index, df_africa['total_cases'])
plt.title('Evolution of Confirmed Covid-19 cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('Confirmed cases', fontsize=16)
Out[83]:
Text(0, 0.5, 'Confirmed cases')

Total_deaths in Africa by the number of Days¶

In [84]:
plt.figure(figsize=(20,10))
plt.plot(df_africa.index, df_africa['total_deaths'])
plt.title('Evolution of Covid-19 Deaths cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('Number of Deaths', fontsize=16)
Out[84]:
Text(0, 0.5, 'Number of Deaths')

New Cases during 20 Days of Africa¶

In [85]:
#during 20 days
plt.figure(figsize=(20,10))
plt.bar(df_africa['date'].head(20), df_africa['new_cases'].head(20))
plt.title('Evolution of Covid-19 New Cases over time in Africa', fontsize=16)
plt.xlabel('Days', fontsize=16)
plt.ylabel('New Cases', fontsize=16)
Out[85]:
Text(0, 0.5, 'New Cases')
In [86]:
import datetime
df_africa['date'] = pd.to_datetime(df_africa['date'], errors='coerce')
df_africa['Month'] = df_africa['date'].dt.month 
print(df_africa)
          date  total_cases  new_cases  total_deaths  Month
0   2020-01-25            5          5             0      1
1   2020-01-26            5          0             0      1
2   2020-01-27            7          2             0      1
3   2020-01-28            7          0             0      1
4   2020-01-29           14          7             0      1
..         ...          ...        ...           ...    ...
766 2022-03-01        17786          0           125      3
767 2022-03-02        17786          0           125      3
768 2022-03-03        17786          0           125      3
769 2022-03-04        17786          0           125      3
770 2022-03-05        17786          0           125      3

[771 rows x 5 columns]

New Cases by Months in Africa¶

In [87]:
plt.figure(figsize=(20,10))
plt.bar(df_africa['Month'], df_africa['new_cases'])
plt.title('Evolution of Covid-19 New Cases over time in Africa', fontsize=16)
plt.xlabel('Months', fontsize=16)
plt.ylabel('New Cases', fontsize=16)
Out[87]:
Text(0, 0.5, 'New Cases')

Tunisia Analysis¶

In [88]:
df_tunisia = df[df['location'] == 'Tunisia']
df_tunisia
Out[88]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy group
41788 TUN Africa Tunisia 2020-03-04 1 1 0 0.084 0.084 11935764 74.228 76.7 g1
41789 TUN Africa Tunisia 2020-03-05 1 0 0 0.084 0.000 11935764 74.228 76.7 g1
41790 TUN Africa Tunisia 2020-03-06 1 0 0 0.084 0.000 11935764 74.228 76.7 g1
41791 TUN Africa Tunisia 2020-03-07 1 0 0 0.084 0.000 11935764 74.228 76.7 g1
41792 TUN Africa Tunisia 2020-03-08 2 1 0 0.168 0.084 11935764 74.228 76.7 g1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
41985 TUN Africa Tunisia 2020-10-15 34790 0 512 2914.769 0.000 11935764 74.228 76.7 g1
41986 TUN Africa Tunisia 2020-10-16 34790 0 512 2914.769 0.000 11935764 74.228 76.7 g1
41987 TUN Africa Tunisia 2020-10-18 40542 0 626 3396.682 0.000 11935764 74.228 76.7 g1
41988 TUN Africa Tunisia 2020-10-22 45892 0 740 3844.915 0.000 11935764 74.228 76.7 g1
41989 TUN Africa Tunisia 2020-10-25 48799 0 819 4088.469 0.000 11935764 74.228 76.7 g1

202 rows × 13 columns

In [89]:
plt.figure(figsize=(20,10))
plt.plot(df_tunisia.index, df_tunisia['new_cases'])
df_tunisia[df_tunisia['date'] == '2021-01-03']
Out[89]:
iso_code continent location date total_cases new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy group

Hierarchical Clustering on the whole Data (World)¶

In [90]:
df2=df_A.loc[:,'new_cases':'life_expectancy']
df2
Out[90]:
new_cases total_deaths total_cases_per_million new_cases_per_million population population_density life_expectancy
0 5 0 0.126 0.126 39835428 54.422 64.83
1 0 0 0.126 0.000 39835428 54.422 64.83
2 0 0 0.126 0.000 39835428 54.422 64.83
3 0 0 0.126 0.000 39835428 54.422 64.83
4 0 0 0.126 0.000 39835428 54.422 64.83
... ... ... ... ... ... ... ...
43801 113 810 125.153 3.706 30490639 53.508 66.12
43802 84 820 127.908 2.755 30490639 53.508 66.12
43803 69 832 130.171 2.263 30490639 53.508 66.12
43804 64 851 132.270 2.099 30490639 53.508 66.12
43805 82 863 134.959 2.689 30490639 53.508 66.12

6431 rows × 7 columns

In [91]:
import scipy.cluster.hierarchy as sch

# graph size
plt.figure(1, figsize = (16 ,8))

# creating the dendrogram
dendrogram = sch.dendrogram(sch.linkage(df2, method  = "ward"))

# ploting graphabs
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()
In [1]:
##from sklearn.cluster import AgglomerativeClustering
##from yellowbrick.cluster import KElbowVisualizer
##model = AgglomerativeClustering() # aglomerative

# k is range of number of clusters.
##visualizer = KElbowVisualizer(model, k=(2,30), timings=False) # by7seb el inertia 
# Fit data to visualizer
##visualizer.fit(df2)
# Finalize and rend;er figure
##visualizer.show();
# best numbers of k of clusters
In [167]:
import scipy.cluster.hierarchy as sch

# size of image
plt.figure(1, figsize = (16 ,8))  
plt.grid(b=None)

# creating the dendrogram 
dend = sch.dendrogram(sch.linkage(df2, method='ward'))

# theroshold
plt.axhline(y=0.2, color='orange')

# ploting graphabs
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show();
C:\Users\Lapshop\AppData\Local\Temp\ipykernel_10012\2204434934.py:5: MatplotlibDeprecationWarning: The 'b' parameter of grid() has been renamed 'visible' since Matplotlib 3.5; support for the old name will be dropped two minor releases later.
  plt.grid(b=None)
In [168]:
import plotly as plt
import plotly.graph_objects as go

# calling the agglomerative algorithm
model = AgglomerativeClustering(n_clusters = 5, affinity = 'euclidean', linkage ='average')
#distance_threshold = distance condition for clustering
# training the model on dataset
y_model = model.fit_predict(df2) # label of clusters for each one

# creating pandas dataframe
df['cluster'] = pd.DataFrame(y_model)

# creating scattered graph
trace1 = go.Scatter3d(
    
    # storing the variables in x, y, and z axis
    hovertext=df['cluster'],
    x= df['date'],
    y= df['total_cases'],
    z= df['total_deaths'],
    mode='markers',
     marker=dict(
        color = df['cluster'],
        size= 3,
        line=dict(
            color= df['cluster'],
            width= 12
        ),
        opacity=0.9
     )
)

# ploting graph 
data = [trace1]
layout = go.Layout(
    title= 'Clusters using Agglomerative Clustering',
    scene = dict(
            xaxis = dict(title  = 'date'),
            yaxis = dict(title  = ' total_cases'),
            zaxis = dict(title  = 'total_deaths')
        ),
    width=1024, height=512
)
fig = go.Figure(data=data, layout=layout)
plt.offline.iplot(fig)
In [123]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
X = df[['total_cases', 'total_deaths', 'population']].values
kmeans.fit(X)
predictions = kmeans.predict(X)
df['cluster'] = predictions

%matplotlib notebook

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df['total_cases'], df['total_deaths'], df['population'], c=df['cluster'])
ax.set_xlabel('Total cases')
ax.set_ylabel('Total deaths')
ax.set_zlabel('Population')

# Show the plot
plt.show()

3. Results¶

1-Africa has the most total deaths on the continent¶

2-Asia has the most total cases on the continent¶

3-Uzbekistan has the most total cases in the world with 116421 cases¶

4-Venezuela has the most total deaths in South America¶

5-Uzbekistan has the newest cases in the world with 472 cases¶

6-Prediction of Total deaths in Angola¶

7-Prediction Of total death for total cases¶

8-Total_deaths in Africa by the number of Days¶

9-New Cases during 20 Days of Africa¶

-New Cases during 20 Days of Africa¶

In [ ]:
 
In [ ]: